The goal of this script is to generate a Seurat object for sample GSM3717034.

  • removal of cells based on quality control metrics
  • normalization with LogNormalize, then doublets detection using scran hybrid and scDblFinder method (but not filtering)
  • normalization with LogNormalize, for only the remaining cells
  • cell cycle and cell type annotation
  • dimensionality reduction using PCA
  • projection using tSNE and UMAP
library(dplyr)
library(patchwork)
library(ggplot2)

.libPaths()
## [1] "/usr/local/lib/R/library"

Preparation

In this section, we set the global settings of the analysis. We will store data there :

out_dir = "."

We load the parameters :

sample_name = params$sample_name # "GSM3717034"
# sample_name = "GSM3717034"

Input count matrix is there :

count_matrix_dir = paste0(out_dir, "/input/", sample_name, "/")
count_matrix_file = list.files(count_matrix_dir, full.names = TRUE)
count_matrix_file
## [1] "./input/GSM3717034//GSM3717034_DS1-Rie_4_comb_clean.dge.txt.gz"

We load the markers and specific colors for each cell type :

cell_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_cell_markers.rds"))
cell_markers = lapply(cell_markers, FUN = toupper)
lengths(cell_markers)
##      CD4 T cells      CD8 T cells Langerhans cells      macrophages 
##               13               13                9               10 
##          B cells          cuticle           cortex          medulla 
##               16               15               16               10 
##              IRS    proliferative              IBL              ORS 
##               16               20               15               16 
##              IFE             HFSC      melanocytes        sebocytes 
##               17               17               10                8

Here are custom colors for each cell type :

color_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_color_markers.rds"))

data.frame(cell_type = names(color_markers),
           color = unlist(color_markers)) %>%
  ggplot2::ggplot(., aes(x = cell_type, y = 0, fill = cell_type)) +
  ggplot2::geom_point(pch = 21, size = 5) +
  ggplot2::scale_fill_manual(values = unlist(color_markers), breaks = names(color_markers)) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none",
                 axis.line = element_blank(),
                 axis.title = element_blank(),
                 axis.ticks = element_blank(),
                 axis.text.y = element_blank())

We load markers to display on the dotplot :

dotplot_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_dotplot_markers.rds"))
dotplot_markers = lapply(dotplot_markers, FUN = toupper)
dotplot_markers
## $`CD4 T cells`
## [1] "CD3E" "CD4" 
## 
## $`CD8 T cells`
## [1] "CD3E" "CD8A"
## 
## $`Langerhans cells`
## [1] "CD207" "CPVL" 
## 
## $macrophages
## [1] "TREM2" "MSR1" 
## 
## $`B cells`
## [1] "CD79A" "CD79B"
## 
## $cuticle
## [1] "KRT32" "KRT35"
## 
## $cortex
## [1] "KRT31" "PRR9" 
## 
## $medulla
## [1] "BAMBI"   "ADLH1A3"
## 
## $IRS
## [1] "KRT71" "KRT73"
## 
## $proliferative
## [1] "TOP2A" "MCM5" 
## 
## $IBL
## [1] "KRT16" "KRT6C"
## 
## $ORS
## [1] "KRT15" "GPX2" 
## 
## $IFE
## [1] "SPINK5" "LY6D"  
## 
## $HFSC
## [1] "DIO2"   "TCEAL2"
## 
## $melanocytes
## [1] "DCT"   "MLANA"
## 
## $sebocytes
## [1] "CLMP"  "PPARG"

We load metadata for this sample :

sample_info = readRDS(paste0(out_dir, "/../1_metadata/takahashi_sample_info.rds"))
sample_info %>%
  dplyr::filter(project_name == sample_name)
##   project_name  sample_type sample_identifier     color
## 1   GSM3717034 Takahashi_HD    Takahashi_HD_1 lawngreen

These is a parameter for different functions :

cl = aquarius::create_parallel_instance(nthreads = 3L)
cut_log_nCount_RNA = 0.5  # almost no filter
cut_nFeature_RNA = 20     # almost no filter
cut_percent.mt = 20
cut_percent.rb = 50

Load count matrix

In this section, we load the count matrix.

mat = read.table(count_matrix_file,
                 header = TRUE, row.names = 1)

# For the two 10X data, this is required
rownames(mat) = stringr::str_remove(rownames(mat),
                                    pattern = "hg19_")

# Seurat object
sobj = Seurat::CreateSeuratObject(counts = mat,
                                  project = sample_name,
                                  assay = "RNA")
rm(mat)
sobj
## An object of class Seurat 
## 12060 features across 3000 samples within 1 assay 
## Active assay: RNA (12060 features, 0 variable features)

(Time to run : 15.91 s)

We add the same columns as in metadata :

row_oi = (sample_info$project_name == sample_name)

sobj$project_name = sample_name
sobj$sample_identifier = sample_info[row_oi, "sample_identifier"]
sobj$sample_type = sample_info[row_oi, "sample_type"]

colnames(sobj@meta.data)
## [1] "orig.ident"        "nCount_RNA"        "nFeature_RNA"     
## [4] "project_name"      "sample_identifier" "sample_type"

Before filtering

Normalization

sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
## An object of class Seurat 
## 12060 features across 3000 samples within 1 assay 
## Active assay: RNA (12060 features, 3000 variable features)

Projection

We generate a tSNE to visualize cells before filtering.

sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")

We generate a tSNE with 20 principal components :

ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"),
                       check_duplicates = FALSE)

sobj
## An object of class Seurat 
## 12060 features across 3000 samples within 1 assay 
## Active assay: RNA (12060 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Cell type

We annotate cells for cell type using Seurat::AddModuleScore function.

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

table(sobj$cell_type)
## 
##      CD4 T cells      CD8 T cells Langerhans cells      macrophages 
##               52              134              347              105 
##          B cells          cuticle           cortex          medulla 
##              127              224              161              214 
##              IRS    proliferative              IBL              ORS 
##               96               93              154              135 
##              IFE             HFSC      melanocytes        sebocytes 
##              511              352              110              185

(Time to run : 1.03 s)

To justify cell type annotation, we can make a dotplot :

markers = c("PTPRC", "MSX2", "KRT16",
            unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))

Cell cycle phase

We annotate cells for cell cycle phase using Seurat and cyclone.

cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]
## 
##  G1 G2M   S 
## 812 466 176
sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
##            
##              G1 G2M   S
##   G1        500 258 108
##   G2M       101  80  16
##   S         201 109  50
##   Undecided  10  19   2

(Time to run : 57.17 s)

We visualize cell cycle on the projection :

plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)

Quality control

In this section, we look at the number of genes expressed by each cell, the number of UMI, the percentage of mitochondrial genes expressed, and the percentage of ribosomal genes expressed. Then, without taking into account the cells expressing low number of genes or have low number of UMI, we identify doublet cells.

We compute four quality metrics :

sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^MT", col.name = "percent.mt")
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^RP[L|S][0-9]*$", col.name = "percent.rb")
sobj$log_nCount_RNA = log(sobj$nCount_RNA)

head(sobj@meta.data)
##              orig.ident nCount_RNA nFeature_RNA project_name sample_identifier
## GAGCCGATGTGC GSM3717034       5722         1680   GSM3717034    Takahashi_HD_1
## AGTCCATTGCTN GSM3717034       3287         1054   GSM3717034    Takahashi_HD_1
## ATATTATTGGCA GSM3717034       1914          894   GSM3717034    Takahashi_HD_1
## GCCCTACGATCG GSM3717034       1829          846   GSM3717034    Takahashi_HD_1
## AACCAATTAGCT GSM3717034       1633          785   GSM3717034    Takahashi_HD_1
## AGCTGTGTGGCT GSM3717034       2910         1058   GSM3717034    Takahashi_HD_1
##               sample_type score_CD4_T_cells score_CD8_T_cells
## GAGCCGATGTGC Takahashi_HD      -0.036368128       -0.03032228
## AGTCCATTGCTN Takahashi_HD      -0.018399124       -0.01047608
## ATATTATTGGCA Takahashi_HD      -0.020065856       -0.02895131
## GCCCTACGATCG Takahashi_HD      -0.007682242       -0.01400089
## AACCAATTAGCT Takahashi_HD      -0.020199867       -0.02454284
## AGCTGTGTGGCT Takahashi_HD      -0.028769986       -0.01862311
##              score_Langerhans_cells score_macrophages score_B_cells
## GAGCCGATGTGC            -0.02434058       -0.08960526   -0.19349544
## AGTCCATTGCTN            -0.02928325       -0.04330710   -0.11435294
## ATATTATTGGCA            -0.03322217       -0.04723650   -0.13033956
## GCCCTACGATCG            -0.02348157       -0.04200266   -0.12012804
## AACCAATTAGCT            -0.02881340       -0.03686086   -0.08707183
## AGCTGTGTGGCT            -0.02306607       -0.06475252   -0.13403591
##              score_cuticle  score_cortex score_medulla   score_IRS
## GAGCCGATGTGC    -0.1689950 -0.4718040437   -0.11920058  0.11574183
## AGTCCATTGCTN    -0.1253240 -0.4075978780   -0.04888838  0.14319348
## ATATTATTGGCA    -0.1341447 -0.0001220687    0.90254786  0.82817744
## GCCCTACGATCG    -0.1287068 -0.4274122086   -0.05804583  0.20253397
## AACCAATTAGCT    -0.1057634 -0.1250109982   -0.04030250 -0.09498712
## AGCTGTGTGGCT    -0.1576319 -0.4133027426   -0.05100236  0.30418639
##              score_proliferative   score_IBL    score_ORS  score_IFE
## GAGCCGATGTGC         -0.09259052  0.19457031  0.073112808  1.1958267
## AGTCCATTGCTN         -0.05028636 -0.07311066 -0.178796294  1.4241773
## ATATTATTGGCA         -0.06190647  0.10088608 -0.165343831 -0.3758492
## GCCCTACGATCG         -0.05787847 -0.05519282 -0.009003299  1.0639101
## AACCAATTAGCT          0.11990736 -0.03063805  0.030348515  0.9579954
## AGCTGTGTGGCT          0.03287891  0.12598597 -0.004139219  1.3349971
##               score_HFSC score_melanocytes score_sebocytes cell_type
## GAGCCGATGTGC -0.21816220        -0.3254230      -0.2337687       IFE
## AGTCCATTGCTN -0.25926402        -0.3536076      -0.1231439       IFE
## ATATTATTGGCA -0.07092577        -0.3363119      -0.1203773       IRS
## GCCCTACGATCG -0.03607841        -0.3337974      -0.1184161       IFE
## AACCAATTAGCT  0.01357552        -0.2995709      -0.1120403       IFE
## AGCTGTGTGGCT -0.06299204        -0.3595476      -0.1575885       IFE
##              Seurat.Phase cyclone.Phase percent.mt percent.rb log_nCount_RNA
## GAGCCGATGTGC           G1            G1  0.8563439   18.94443       8.652074
## AGTCCATTGCTN           G1            G1  0.4867660   18.52753       8.097731
## ATATTATTGGCA            S            G1  2.2988506   17.97283       7.556951
## GCCCTACGATCG           G1            G1  1.0934937   20.55768       7.511525
## AACCAATTAGCT            S            G1  2.2657685   19.71831       7.398174
## AGCTGTGTGGCT           G1            G1  2.5085911   16.94158       7.975908

We get the cell barcodes for the failing cells :

fail_percent.mt = sobj@meta.data %>% dplyr::filter(percent.mt > cut_percent.mt) %>% rownames()
fail_percent.rb = sobj@meta.data %>% dplyr::filter(percent.rb > cut_percent.rb) %>% rownames()
fail_log_nCount_RNA = sobj@meta.data %>% dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA) %>% rownames()
fail_nFeature_RNA = sobj@meta.data %>% dplyr::filter(nFeature_RNA < cut_nFeature_RNA) %>% rownames()

Doublet cells detection

Without taking into account the low UMI and low number of features cells, we identify doublets.

fsobj = subset(sobj, invert = TRUE,
               cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA)))

fsobj
## An object of class Seurat 
## 12060 features across 2439 samples within 1 assay 
## Active assay: RNA (12060 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

On this filtered dataset, we apply doublet cells detection. Just before, we run the normalization, taking into account only the remaining cells.

fsobj = Seurat::NormalizeData(fsobj,
                              normalization.method = "LogNormalize",
                              assay = "RNA")

fsobj = Seurat::FindVariableFeatures(fsobj,
                                     assay = "RNA",
                                     nfeatures = 3000)
fsobj
## An object of class Seurat 
## 12060 features across 2439 samples within 1 assay 
## Active assay: RNA (12060 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

We identify doublet cells :

fsobj = aquarius::find_doublets(sobj = fsobj,
                                BPPARAM = cl)
## [1] 12060  2439
## 
## FALSE  TRUE 
##  1956   483 
## [12:02:06] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## 
## FALSE  TRUE 
##  2367    72 
## 
## FALSE  TRUE 
##  1932   507
fail_doublets_consensus = Seurat::WhichCells(fsobj, expression = doublets_consensus.class)
fail_doublets_scDblFinder = Seurat::WhichCells(fsobj, expression = scDblFinder.class)
fail_doublets_hybrid = Seurat::WhichCells(fsobj, expression = hybrid_score.class)

(Time to run : 33.81 s)

We add the information in the non filtered Seurat object :

sobj$doublets_consensus.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                                 colnames(sobj) %in% fail_doublets_consensus ~ TRUE,
                                                 !(colnames(sobj) %in% fail_doublets_consensus) ~ FALSE)

sobj$scDblFinder.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                          colnames(sobj) %in% fail_doublets_scDblFinder ~ TRUE,
                                          !(colnames(sobj) %in% fail_doublets_scDblFinder) ~ FALSE)

sobj$hybrid_score.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                           colnames(sobj) %in% fail_doublets_hybrid ~ TRUE,
                                           !(colnames(sobj) %in% fail_doublets_hybrid) ~ FALSE)

Quality control representation

We can visualize the 4 cells quality with a Venn diagram :

n_filtered = c(fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA) %>%
  unique() %>% length()
percent_filtered = round(100*(n_filtered/ncol(sobj)), 2)

ggvenn::ggvenn(list(percent.mt = fail_percent.mt,
                    percent.rb = fail_percent.rb,
                    log_nCount_RNA = fail_log_nCount_RNA,
                    nFeature_RNA = fail_nFeature_RNA), 
               fill_color = c("#0073C2FF", "#EFC000FF", "orange", "pink"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::labs(title = "Filtered out cells",
                subtitle = paste0(n_filtered, " cells (", percent_filtered, " % of all cells)")) +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                 plot.subtitle = element_text(hjust = 0.5))

Number of UMI

To visualize the threshold for number of UMI, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "log_nCount_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_log_nCount_RNA)

Seurat::VlnPlot(sobj, features = "log_nCount_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_log_nCount_RNA, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_log_nCount_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "log_nCount_RNA",
                subtitle = paste0(length(fail_log_nCount_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Number of features

To visualize the threshold for number of features, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "nFeature_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_nFeature_RNA)

Seurat::VlnPlot(sobj, features = "nFeature_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_nFeature_RNA, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_nFeature_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "nFeature_RNA",
                subtitle = paste0(length(fail_nFeature_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Mitochondrial genes expression

To identify a threshold for mitochondrial gene expression, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.mt",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.mt)

Seurat::VlnPlot(sobj, features = "percent.mt", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.mt, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_percent.mt,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.mt",
                subtitle = paste0(length(fail_percent.mt), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Ribosomal genes expression

To identify a threshold for ribosomal gene expression, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.rb",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.rb)

Seurat::VlnPlot(sobj, features = "percent.rb", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.rb, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_percent.rb,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.rb",
                subtitle = paste0(length(fail_percent.rb), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

FACS-like figure

We would like to see if the number of feature expressed by cell, and the number of UMI is correlated with the cell type, the percentage of mitochondrial and ribosomal gene expressed, and the doublet status. We build the log_nCount_RNA by nFeature_RNA figure, where cells (dots) are colored by these different metrics.

This is the figure, colored by cell type :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "cell_type",
                           col_colors = unname(color_markers),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the percentage of mitochondrial genes expressed in cell :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.mt",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the percentage of ribosomal genes expressed in cell :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.rb",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (doublets_consensus.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "doublets_consensus.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (scDblFinder.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "scDblFinder.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (hybrid_score.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "hybrid_score.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

Visualization as piechart

Do filtered cells belong to a particular cell type ?

sobj$all_cells = TRUE

plot_list = list()

## All cells
df = sobj@meta.data
if (nrow(df) == 0) {
  plot_list[[1]] = ggplot()
} else {
  plot_list[[1]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "All cells",
                  subtitle = paste(nrow(df), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## Doublets consensus
df = sobj@meta.data %>%
  dplyr::filter(doublets_consensus.class)
if (nrow(df) == 0) {
  plot_list[[2]] = ggplot()
} else {
  plot_list[[2]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "doublets_consensus.class",
                  subtitle = paste(sum(sobj$doublets_consensus.class, na.rm = TRUE), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.mt
df = sobj@meta.data %>%
  dplyr::filter(percent.mt > cut_percent.mt)
if (nrow(df) == 0) {
  plot_list[[3]] = ggplot()
} else {
  plot_list[[3]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.mt >", cut_percent.mt),
                  subtitle = paste(length(fail_percent.mt), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.rb
df = sobj@meta.data %>%
  dplyr::filter(percent.rb > cut_percent.rb)
if (nrow(df) == 0) {
  plot_list[[4]] = ggplot()
} else {
  plot_list[[4]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.rb >", cut_percent.rb),
                  subtitle = paste(length(fail_percent.rb), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## log_nCount_RNA
df = sobj@meta.data %>%
  dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA)
if (nrow(df) == 0) {
  plot_list[[5]] = ggplot()
} else {
  plot_list[[5]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("log_nCount_RNA <", round(cut_log_nCount_RNA, 2)),
                  subtitle = paste(length(fail_log_nCount_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## nFeature_RNA
df = sobj@meta.data %>%
  dplyr::filter(nFeature_RNA < cut_nFeature_RNA)
if (nrow(df) == 0) {
  plot_list[[6]] = ggplot()
} else {
  plot_list[[6]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("nFeature_RNA <", round(cut_nFeature_RNA, 2)),
                  subtitle = paste(length(fail_nFeature_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

patchwork::wrap_plots(plot_list, ncol = 3) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")

Doublet cells status

We can compare doublet detection methods with a Venn diagram :

ggvenn::ggvenn(list(hybrid = fail_doublets_hybrid,
                    scDblFinder = fail_doublets_scDblFinder), 
               fill_color = c("#0073C2FF", "#EFC000FF"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::ggtitle(label = "Doublet cells") +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"))

We visualize cells annotation for doublets :

plot_list = list()

# scDblFinder.class
sobj$fail = ifelse(sobj$scDblFinder.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "scDblFinder.class",
                subtitle = paste0(sum(sobj$scDblFinder.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() + Seurat::NoLegend() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

# hybrid_score.class
sobj$fail = ifelse(sobj$hybrid_score.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "hybrid_score.class",
                subtitle = paste0(sum(sobj$hybrid_score.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

sobj$fail = NULL

# Plot
patchwork::wrap_plots(plot_list, nrow = 1)

What is the composition of doublet cells ? We just look at score for each cell type.

sobj$orig.ident.doublets = case_when(is.na(sobj$doublets_consensus.class) ~ "bad quality",
                                     sobj$doublets_consensus.class == TRUE ~ paste0(sobj$orig.ident, " doublets"),
                                     sobj$doublets_consensus.class == FALSE ~ "not doublet")
sobj$orig.ident.doublets = factor(sobj$orig.ident.doublets,
                                  levels = c(paste0(as.character(sample_info$sample_identifiant), " doublets"),
                                             "bad quality", "not doublet"))

doublets_compo = function(score1, score2) {
  type1 = unlist(lapply(stringr::str_split(score1, pattern = "score_"), `[[`, 2))
  type2 = unlist(lapply(stringr::str_split(score2, pattern = "score_"), `[[`, 2))
  
  if (type1 == type2) {
    the_title = "Homotypic doublet"
    the_subtitle = type1
    score1 = "log_nCount_RNA"
  } else {
    the_title = "Heterotypic doublet"
    the_subtitle = paste(type1, type2, sep = " + ")
  }
  
  p = sobj@meta.data %>%
    dplyr::arrange(desc(orig.ident.doublets)) %>%
    ggplot2::ggplot(., aes(x = eval(parse(text = score1)),
                           y = eval(parse(text = score2)),
                           col = orig.ident.doublets)) +
    ggplot2::geom_point(size = 0.25) +
    ggplot2::scale_color_manual(values = c(sample_info$color, "gray90", "gray60"),
                                breaks = c(paste0(as.character(sample_info$sample_identifiant), " doublets"),
                                           "bad quality", "not doublet")) +
    ggplot2::labs(x = score1, y = score2,
                  title = the_title, subtitle = the_subtitle) +
    ggplot2::theme_classic() +
    ggplot2::theme(aspect.ratio = 1,
                   plot.title = element_text(hjust = 0.5),
                   plot.subtitle = element_text(hjust = 0.5))
  
  return(p)
}
score_columns = grep(x = colnames(sobj@meta.data),
                     pattern = "^score",
                     value = TRUE)
combinations = expand.grid(score_columns, score_columns) %>%
  apply(., 1, sort) %>% t() %>%
  as.data.frame()
combinations = combinations[!duplicated(combinations), ]

plot_list = apply(combinations, 1, FUN = function(elem) {
  doublets_compo(elem[1], elem[2])
})

sobj$orig.ident.doublets = NULL
patchwork::wrap_plots(plot_list, ncol = 4) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")
show

Save

We could save this object before filtering (remove eval = FALSE) :

saveRDS(sobj, paste0(out_dir, "/datasets/", sample_name, "_sobj_unfiltered.rds"))

Filtering

We remove :

  • cells with a number of UMI lower than 0.5
  • cells expressing a number of genes lower than 20
  • cells having more than 20 % of UMI related to mitochondrial genes
  • cells having more than 50 % of UMI related to ribosomal genes

Note: We do not filter cells detected as doublets. Indeed, few genes and transcripts are detected per cell, and the best cells are therefore annotated as doublets.

sobj = subset(sobj, invert = TRUE,
              cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA,
                               fail_percent.mt, fail_percent.rb)))
sobj
## An object of class Seurat 
## 12060 features across 2421 samples within 1 assay 
## Active assay: RNA (12060 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Post-filtering processing

Normalization

We normalize the count matrix for remaining cells :

sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
## An object of class Seurat 
## 12060 features across 2421 samples within 1 assay 
## Active assay: RNA (12060 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Projection

We perform a PCA :

sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")

We generate a tSNE and a UMAP with 20 principal components :

ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"),
                       check_duplicates = FALSE)

sobj = Seurat::RunUMAP(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_umap"))

Annotation

We annotate cells for cell type, with the new normalized expression matrix :

score_columns = grep(x = colnames(sobj@meta.data), pattern = "^score", value = TRUE)
sobj@meta.data[, score_columns] = NULL
sobj$cell_type = NULL

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

table(sobj$cell_type)
## 
##      CD4 T cells      CD8 T cells Langerhans cells      macrophages 
##               36              260               97              156 
##          B cells          cuticle           cortex          medulla 
##              112              139              139              120 
##              IRS    proliferative              IBL              ORS 
##               44               41              141              102 
##              IFE             HFSC      melanocytes        sebocytes 
##              456              314               93              171

(Time to run : 0.98 s)

To justify cell type annotation, we can make a dotplot :

markers = c("PTPRC", unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))

Cell cycle

We annotate cells for cell cycle phase :

cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]
## 
##  G1 G2M   S 
## 784 425 164
sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
##            
##              G1 G2M   S
##   G1        478 234 105
##   G2M       107  77  22
##   S         185  99  37
##   Undecided  14  15   0

(Time to run : 49.21 s)

We visualize cell cycle on the projection :

plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)

Clustering

We make a highly resolutive clustering :

sobj = Seurat::FindNeighbors(sobj, reduction = "RNA_pca", dims = c(1:ndims))
sobj = Seurat::FindClusters(sobj, resolution = 2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2421
## Number of edges: 103171
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.5082
## Number of communities: 14
## Elapsed time: 0 seconds
table(sobj$seurat_clusters)
## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13 
## 328 295 272 268 248 176 153 146 143 136 114  60  42  40

Visualization

Cell type

We can visualize the cell type :

tsne = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_tsne"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_umap"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Cell cycle

We can visualize the cell cycle, from Seurat :

tsne = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

We can visualize the cell cycle, from cyclone :

tsne = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Clusters

We visualize the clustering :

tsne = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Gene expression

We visualize all cell types markers on the tSNE :

markers = dotplot_markers %>% unlist() %>% unname()
markers = markers[markers %in% rownames(sobj)]

plot_list = lapply(markers,
                   FUN = function(one_gene) {
                     p = Seurat::FeaturePlot(sobj, features = one_gene,
                                             reduction = paste0("RNA_pca_", ndims, "_tsne")) +
                       ggplot2::labs(title = one_gene) +
                       ggplot2::scale_color_gradientn(colors = aquarius::color_gene) +
                       ggplot2::theme(aspect.ratio = 1,
                                      plot.subtitle = element_text(hjust = 0.5)) +
                       Seurat::NoAxes()
                     return(p)
                   })

patchwork::wrap_plots(plot_list, ncol = 4)

Save

We save the annotated and filtered Seurat object :

saveRDS(sobj, file = paste0(out_dir, "/datasets/", sample_name, "_sobj_filtered.rds"))

R session

show
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.3.5   patchwork_1.1.2 dplyr_1.0.7    
## 
## loaded via a namespace (and not attached):
##   [1] softImpute_1.4              graphlayouts_0.7.0         
##   [3] pbapply_1.4-2               lattice_0.20-41            
##   [5] haven_2.3.1                 vctrs_0.3.8                
##   [7] usethis_2.0.1               dynwrap_1.2.1              
##   [9] blob_1.2.1                  survival_3.2-13            
##  [11] prodlim_2019.11.13          dynutils_1.0.5             
##  [13] later_1.3.0                 DBI_1.1.1                  
##  [15] R.utils_2.11.0              SingleCellExperiment_1.8.0 
##  [17] rappdirs_0.3.3              uwot_0.1.8                 
##  [19] dqrng_0.2.1                 jpeg_0.1-8.1               
##  [21] zlibbioc_1.32.0             pspline_1.0-18             
##  [23] pcaMethods_1.78.0           mvtnorm_1.1-1              
##  [25] htmlwidgets_1.5.4           GlobalOptions_0.1.2        
##  [27] future_1.22.1               UpSetR_1.4.0               
##  [29] laeken_0.5.2                leiden_0.3.3               
##  [31] clustree_0.4.3              parallel_3.6.3             
##  [33] scater_1.14.6               irlba_2.3.3                
##  [35] DEoptimR_1.0-9              tidygraph_1.1.2            
##  [37] Rcpp_1.0.9                  readr_2.0.2                
##  [39] KernSmooth_2.23-17          carrier_0.1.0              
##  [41] promises_1.1.0              gdata_2.18.0               
##  [43] DelayedArray_0.12.3         limma_3.42.2               
##  [45] graph_1.64.0                RcppParallel_5.1.4         
##  [47] Hmisc_4.4-0                 fs_1.5.2                   
##  [49] RSpectra_0.16-0             fastmatch_1.1-0            
##  [51] ranger_0.12.1               digest_0.6.25              
##  [53] png_0.1-7                   sctransform_0.2.1          
##  [55] cowplot_1.0.0               DOSE_3.12.0                
##  [57] ggvenn_0.1.9                here_1.0.1                 
##  [59] TInGa_0.0.0.9000            ggraph_2.0.3               
##  [61] pkgconfig_2.0.3             GO.db_3.10.0               
##  [63] DelayedMatrixStats_1.8.0    gower_0.2.1                
##  [65] ggbeeswarm_0.6.0            iterators_1.0.12           
##  [67] DropletUtils_1.6.1          reticulate_1.26            
##  [69] clusterProfiler_3.14.3      SummarizedExperiment_1.16.1
##  [71] circlize_0.4.15             beeswarm_0.4.0             
##  [73] GetoptLong_1.0.5            xfun_0.35                  
##  [75] bslib_0.3.1                 zoo_1.8-10                 
##  [77] tidyselect_1.1.0            reshape2_1.4.4             
##  [79] purrr_0.3.4                 ica_1.0-2                  
##  [81] pcaPP_1.9-73                viridisLite_0.3.0          
##  [83] rtracklayer_1.46.0          rlang_1.0.2                
##  [85] hexbin_1.28.1               jquerylib_0.1.4            
##  [87] dyneval_0.9.9               glue_1.4.2                 
##  [89] RColorBrewer_1.1-2          matrixStats_0.56.0         
##  [91] stringr_1.4.0               lava_1.6.7                 
##  [93] europepmc_0.3               DESeq2_1.26.0              
##  [95] recipes_0.1.17              labeling_0.3               
##  [97] httpuv_1.5.2                class_7.3-17               
##  [99] BiocNeighbors_1.4.2         DO.db_2.9                  
## [101] annotate_1.64.0             jsonlite_1.7.2             
## [103] XVector_0.26.0              bit_4.0.4                  
## [105] mime_0.9                    aquarius_0.1.5             
## [107] Rsamtools_2.2.3             gridExtra_2.3              
## [109] gplots_3.0.3                stringi_1.4.6              
## [111] processx_3.5.2              gsl_2.1-6                  
## [113] bitops_1.0-6                cli_3.0.1                  
## [115] batchelor_1.2.4             RSQLite_2.2.0              
## [117] randomForest_4.6-14         tidyr_1.1.4                
## [119] data.table_1.14.2           rstudioapi_0.13            
## [121] org.Mm.eg.db_3.10.0         GenomicAlignments_1.22.1   
## [123] nlme_3.1-147                qvalue_2.18.0              
## [125] scran_1.14.6                locfit_1.5-9.4             
## [127] scDblFinder_1.1.8           listenv_0.8.0              
## [129] ggthemes_4.2.4              gridGraphics_0.5-0         
## [131] R.oo_1.24.0                 dbplyr_1.4.4               
## [133] BiocGenerics_0.32.0         TTR_0.24.2                 
## [135] readxl_1.3.1                lifecycle_1.0.1            
## [137] timeDate_3043.102           ggpattern_0.3.1            
## [139] munsell_0.5.0               cellranger_1.1.0           
## [141] R.methodsS3_1.8.1           proxyC_0.1.5               
## [143] visNetwork_2.0.9            caTools_1.18.0             
## [145] codetools_0.2-16            Biobase_2.46.0             
## [147] GenomeInfoDb_1.22.1         vipor_0.4.5                
## [149] lmtest_0.9-38               msigdbr_7.5.1              
## [151] htmlTable_1.13.3            triebeard_0.3.0            
## [153] lsei_1.2-0                  xtable_1.8-4               
## [155] ROCR_1.0-7                  BiocManager_1.30.10        
## [157] scatterplot3d_0.3-41        abind_1.4-5                
## [159] farver_2.0.3                parallelly_1.28.1          
## [161] RANN_2.6.1                  askpass_1.1                
## [163] GenomicRanges_1.38.0        RcppAnnoy_0.0.16           
## [165] tibble_3.1.5                ggdendro_0.1-20            
## [167] cluster_2.1.0               future.apply_1.5.0         
## [169] Seurat_3.1.5                dendextend_1.15.1          
## [171] Matrix_1.3-2                ellipsis_0.3.2             
## [173] prettyunits_1.1.1           lubridate_1.7.9            
## [175] ggridges_0.5.2              igraph_1.2.5               
## [177] RcppEigen_0.3.3.7.0         fgsea_1.12.0               
## [179] remotes_2.4.2               scBFA_1.0.0                
## [181] destiny_3.0.1               VIM_6.1.1                  
## [183] testthat_3.1.0              htmltools_0.5.2            
## [185] BiocFileCache_1.10.2        yaml_2.2.1                 
## [187] utf8_1.1.4                  plotly_4.9.2.1             
## [189] XML_3.99-0.3                ModelMetrics_1.2.2.2       
## [191] e1071_1.7-3                 foreign_0.8-76             
## [193] withr_2.5.0                 fitdistrplus_1.0-14        
## [195] BiocParallel_1.20.1         xgboost_1.4.1.1            
## [197] bit64_4.0.5                 foreach_1.5.0              
## [199] robustbase_0.93-9           Biostrings_2.54.0          
## [201] GOSemSim_2.13.1             rsvd_1.0.3                 
## [203] memoise_2.0.0               evaluate_0.18              
## [205] forcats_0.5.0               rio_0.5.16                 
## [207] geneplotter_1.64.0          tzdb_0.1.2                 
## [209] caret_6.0-86                ps_1.6.0                   
## [211] DiagrammeR_1.0.6.1          curl_4.3                   
## [213] fdrtool_1.2.15              fansi_0.4.1                
## [215] highr_0.8                   urltools_1.7.3             
## [217] xts_0.12.1                  GSEABase_1.48.0            
## [219] acepack_1.4.1               edgeR_3.28.1               
## [221] checkmate_2.0.0             scds_1.2.0                 
## [223] cachem_1.0.6                npsurv_0.4-0               
## [225] babelgene_22.3              rjson_0.2.20               
## [227] openxlsx_4.1.5              ggrepel_0.9.1              
## [229] clue_0.3-60                 rprojroot_2.0.2            
## [231] stabledist_0.7-1            tools_3.6.3                
## [233] sass_0.4.0                  nichenetr_1.1.1            
## [235] magrittr_2.0.1              RCurl_1.98-1.2             
## [237] proxy_0.4-24                car_3.0-11                 
## [239] ape_5.3                     ggplotify_0.0.5            
## [241] xml2_1.3.2                  httr_1.4.2                 
## [243] assertthat_0.2.1            rmarkdown_2.18             
## [245] boot_1.3-25                 globals_0.14.0             
## [247] R6_2.4.1                    Rhdf5lib_1.8.0             
## [249] nnet_7.3-14                 RcppHNSW_0.2.0             
## [251] progress_1.2.2              genefilter_1.68.0          
## [253] statmod_1.4.34              gtools_3.8.2               
## [255] shape_1.4.6                 HDF5Array_1.14.4           
## [257] BiocSingular_1.2.2          rhdf5_2.30.1               
## [259] splines_3.6.3               AUCell_1.8.0               
## [261] carData_3.0-4               colorspace_1.4-1           
## [263] generics_0.1.0              stats4_3.6.3               
## [265] base64enc_0.1-3             dynfeature_1.0.0           
## [267] smoother_1.1                gridtext_0.1.1             
## [269] pillar_1.6.3                tweenr_1.0.1               
## [271] sp_1.4-1                    ggplot.multistats_1.0.0    
## [273] rvcheck_0.1.8               GenomeInfoDbData_1.2.2     
## [275] plyr_1.8.6                  gtable_0.3.0               
## [277] zip_2.2.0                   knitr_1.41                 
## [279] ComplexHeatmap_2.14.0       latticeExtra_0.6-29        
## [281] biomaRt_2.42.1              IRanges_2.20.2             
## [283] fastmap_1.1.0               ADGofTest_0.3              
## [285] copula_1.0-0                doParallel_1.0.15          
## [287] AnnotationDbi_1.48.0        vcd_1.4-8                  
## [289] babelwhale_1.0.1            openssl_1.4.1              
## [291] scales_1.1.1                backports_1.2.1            
## [293] S4Vectors_0.24.4            ipred_0.9-12               
## [295] enrichplot_1.6.1            hms_1.1.1                  
## [297] ggforce_0.3.1               Rtsne_0.15                 
## [299] shiny_1.7.1                 numDeriv_2016.8-1.1        
## [301] polyclip_1.10-0             grid_3.6.3                 
## [303] lazyeval_0.2.2              Formula_1.2-3              
## [305] tsne_0.1-3                  crayon_1.3.4               
## [307] MASS_7.3-54                 pROC_1.16.2                
## [309] viridis_0.5.1               dynparam_1.0.0             
## [311] rpart_4.1-15                zinbwave_1.8.0             
## [313] compiler_3.6.3              ggtext_0.1.0
---
params:
  sample_name: "GSM3717034"
title: "GSE129611 dataset"
subtitle: "Sample `r params$sample_name`"
author: "Audrey"
date: "`r format(Sys.time(), '%Y-%m-%d')`"
output:
  html_document:
    code_folding: show
    code_download: true
    toc: true
    toc_float: true
    number_sections: false
---

<style>
body {
text-align: justify}
</style>

<!-- Automatically computes and prints in the output the running time for any code chunk -->
```{r, echo=FALSE}
# https://github.com/rstudio/rmarkdown/issues/1453
hooks = knitr::knit_hooks$get()
hook_foldable = function(type) {
  force(type)
  function(x, options) {
    res = hooks[[type]](x, options)
    
    if (isFALSE(options[[paste0("fold_", type)]])) return(res)
    
    paste0(
      "<details><summary>", "show", "</summary>\n\n",
      res,
      "\n\n</details>"
    )
  }
}
knitr::knit_hooks$set(
  output = hook_foldable("output"),
  plot = hook_foldable("plot"),
  time_it = local({
    now = NULL
    function(before, options) {
      if (options$time_it) {
        if (before) {
          now <<- Sys.time()
        } else {
          res = difftime(Sys.time(), now, units = "secs")
          paste("(Time to run :", round(res, digits = 2), "s)")
        }
      }
    }
  })
)
```

<!-- Set default parameters for all chunks -->
```{r, setup, include = FALSE}
set.seed(1337L)
knitr::opts_chunk$set(echo = TRUE, # display code
                      # display chunk output
                      message = FALSE,
                      warning = FALSE,
                      fold_output = FALSE, # usefull for sessionInfo()
                      fold_plot = FALSE,
                      
                      # figure settings
                      fig.align = 'center',
                      fig.width = 20,
                      fig.height = 15,
                      
                      # something about seed, chunk and Rmarkdown compilation
                      # https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-error-in-rmd-but-not-in-r-script
                      # cache = TRUE,
                      cache.lazy = FALSE, 
                      
                      # add runtime after chunk
                      time_it = FALSE)
```


The goal of this script is to generate a Seurat object for sample `r params$sample_name`.

* removal of cells based on quality control metrics
* normalization with `LogNormalize`, then doublets detection using `scran hybrid` and `scDblFinder` method (but not filtering)
* normalization with `LogNormalize`, for only the remaining cells
* cell cycle and cell type annotation
* dimensionality reduction using `PCA`
* projection using `tSNE` and `UMAP`


```{r library}
library(dplyr)
library(patchwork)
library(ggplot2)

.libPaths()
```

# Preparation

In this section, we set the global settings of the analysis. We will store data there :

```{r out_dir}
out_dir = "."
```

We load the parameters :

```{r get_param}
sample_name = params$sample_name # "GSM3717034"
# sample_name = "GSM3717034"
```

Input count matrix is there :

```{r count_matrix_dir}
count_matrix_dir = paste0(out_dir, "/input/", sample_name, "/")
count_matrix_file = list.files(count_matrix_dir, full.names = TRUE)
count_matrix_file
```


We load the markers and specific colors for each cell type :

```{r cell_markers}
cell_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_cell_markers.rds"))
cell_markers = lapply(cell_markers, FUN = toupper)
lengths(cell_markers)
```

Here are custom colors for each cell type :

```{r color_markers, fig.width = 12, fig.height = 1, class.source = "fold-hide"}
color_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_color_markers.rds"))

data.frame(cell_type = names(color_markers),
           color = unlist(color_markers)) %>%
  ggplot2::ggplot(., aes(x = cell_type, y = 0, fill = cell_type)) +
  ggplot2::geom_point(pch = 21, size = 5) +
  ggplot2::scale_fill_manual(values = unlist(color_markers), breaks = names(color_markers)) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none",
                 axis.line = element_blank(),
                 axis.title = element_blank(),
                 axis.ticks = element_blank(),
                 axis.text.y = element_blank())
```


We load markers to display on the dotplot :

```{r dotplot_markers}
dotplot_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_dotplot_markers.rds"))
dotplot_markers = lapply(dotplot_markers, FUN = toupper)
dotplot_markers
```


We load metadata for this sample :

```{r sample_info}
sample_info = readRDS(paste0(out_dir, "/../1_metadata/takahashi_sample_info.rds"))
sample_info %>%
  dplyr::filter(project_name == sample_name)
```


These is a parameter for different functions :

```{r global_settings}
cl = aquarius::create_parallel_instance(nthreads = 3L)
cut_log_nCount_RNA = 0.5  # almost no filter
cut_nFeature_RNA = 20     # almost no filter
cut_percent.mt = 20
cut_percent.rb = 50
```

# Load count matrix

In this section, we load the count matrix.

```{r load_count_matrix, time_it = TRUE}
mat = read.table(count_matrix_file,
                 header = TRUE, row.names = 1)

# For the two 10X data, this is required
rownames(mat) = stringr::str_remove(rownames(mat),
                                    pattern = "hg19_")

# Seurat object
sobj = Seurat::CreateSeuratObject(counts = mat,
                                  project = sample_name,
                                  assay = "RNA")
rm(mat)
sobj
```


We add the same columns as in metadata :

```{r add_metadata}
row_oi = (sample_info$project_name == sample_name)

sobj$project_name = sample_name
sobj$sample_identifier = sample_info[row_oi, "sample_identifier"]
sobj$sample_type = sample_info[row_oi, "sample_type"]

colnames(sobj@meta.data)
```

# Before filtering

## Normalization

```{r normalization}
sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
```

## Projection

We generate a tSNE to visualize cells before filtering.

```{r pca_before, fig.width = 12, fig.height = 4}
sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")
```


We generate a tSNE with 20 principal components :

```{r tsne_before}
ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"),
                       check_duplicates = FALSE)

sobj
```

## Cell type

We annotate cells for cell type using `Seurat::AddModuleScore` function.

```{r cell_annot_custom_short, time_it = TRUE}
sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

table(sobj$cell_type)
```

To justify cell type annotation, we can make a dotplot :

```{r dotplot_cell_type_short, fig.width = 12, fig.height = 9}
markers = c("PTPRC", "MSX2", "KRT16",
            unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))
```

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

```{r barplot_celltype, fig.height = 6, fig.width = 8, class.source = "fold-hide"}
df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))
```

## Cell cycle phase

We annotate cells for cell cycle phase using `Seurat` and `cyclone`.

```{r cell_cycle, time_it = TRUE}
cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]

sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
```

We visualize cell cycle on the projection :

```{r see_cell_cycle1, fig.width = 12, fig.height = 7, class.source = "fold-hide"}
plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)
```

# Quality control

In this section, we look at the number of genes expressed by each cell, the number of UMI, the percentage of mitochondrial genes expressed, and the percentage of ribosomal genes expressed. Then, without taking into account the cells expressing low number of genes or have low number of UMI, we identify doublet cells.

We compute four quality metrics :

```{r qc_metrics}
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^MT", col.name = "percent.mt")
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^RP[L|S][0-9]*$", col.name = "percent.rb")
sobj$log_nCount_RNA = log(sobj$nCount_RNA)

head(sobj@meta.data)
```

We get the cell barcodes for the failing cells :

```{r failed}
fail_percent.mt = sobj@meta.data %>% dplyr::filter(percent.mt > cut_percent.mt) %>% rownames()
fail_percent.rb = sobj@meta.data %>% dplyr::filter(percent.rb > cut_percent.rb) %>% rownames()
fail_log_nCount_RNA = sobj@meta.data %>% dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA) %>% rownames()
fail_nFeature_RNA = sobj@meta.data %>% dplyr::filter(nFeature_RNA < cut_nFeature_RNA) %>% rownames()
```


## Doublet cells detection

Without taking into account the low UMI and low number of features cells, we identify doublets.

```{r fsobj}
fsobj = subset(sobj, invert = TRUE,
               cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA)))

fsobj
```

On this filtered dataset, we apply doublet cells detection. Just before, we run the normalization, taking into account only the remaining cells.

```{r normalization_doublets}
fsobj = Seurat::NormalizeData(fsobj,
                              normalization.method = "LogNormalize",
                              assay = "RNA")

fsobj = Seurat::FindVariableFeatures(fsobj,
                                     assay = "RNA",
                                     nfeatures = 3000)
fsobj
```

We identify doublet cells :

```{r doublet_detection, time_it = TRUE}
fsobj = aquarius::find_doublets(sobj = fsobj,
                                BPPARAM = cl)

fail_doublets_consensus = Seurat::WhichCells(fsobj, expression = doublets_consensus.class)
fail_doublets_scDblFinder = Seurat::WhichCells(fsobj, expression = scDblFinder.class)
fail_doublets_hybrid = Seurat::WhichCells(fsobj, expression = hybrid_score.class)
```

We add the information in the non filtered Seurat object :

```{r add_doublet_metrics}
sobj$doublets_consensus.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                                 colnames(sobj) %in% fail_doublets_consensus ~ TRUE,
                                                 !(colnames(sobj) %in% fail_doublets_consensus) ~ FALSE)

sobj$scDblFinder.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                          colnames(sobj) %in% fail_doublets_scDblFinder ~ TRUE,
                                          !(colnames(sobj) %in% fail_doublets_scDblFinder) ~ FALSE)

sobj$hybrid_score.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                           colnames(sobj) %in% fail_doublets_hybrid ~ TRUE,
                                           !(colnames(sobj) %in% fail_doublets_hybrid) ~ FALSE)
```

```{r clean_fsobj, echo = FALSE}
rm(fsobj)
```


## Quality control representation

We can visualize the 4 cells quality with a Venn diagram : 

```{r qc_venn, class.source = "fold-hide", fig.width = 8, fig.height = 6}
n_filtered = c(fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA) %>%
  unique() %>% length()
percent_filtered = round(100*(n_filtered/ncol(sobj)), 2)

ggvenn::ggvenn(list(percent.mt = fail_percent.mt,
                    percent.rb = fail_percent.rb,
                    log_nCount_RNA = fail_log_nCount_RNA,
                    nFeature_RNA = fail_nFeature_RNA), 
               fill_color = c("#0073C2FF", "#EFC000FF", "orange", "pink"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::labs(title = "Filtered out cells",
                subtitle = paste0(n_filtered, " cells (", percent_filtered, " % of all cells)")) +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                 plot.subtitle = element_text(hjust = 0.5))
```


### Number of UMI

To visualize the threshold for number of UMI, we can make a histogram :

```{r qc_umi_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "log_nCount_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_log_nCount_RNA)
```

```{r vln_umi_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "log_nCount_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_log_nCount_RNA, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_umi_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_log_nCount_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "log_nCount_RNA",
                subtitle = paste0(length(fail_log_nCount_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Number of features

To visualize the threshold for number of features, we can make a histogram :

```{r qc_features_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "nFeature_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_nFeature_RNA)
```


```{r vln_features_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "nFeature_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_nFeature_RNA, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_features_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_nFeature_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "nFeature_RNA",
                subtitle = paste0(length(fail_nFeature_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Mitochondrial genes expression

To identify a threshold for mitochondrial gene expression, we can make a histogram :

```{r qc_mito_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.mt",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.mt)
```

```{r vln_percentmt_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "percent.mt", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.mt, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_mito_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_percent.mt,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.mt",
                subtitle = paste0(length(fail_percent.mt), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Ribosomal genes expression

To identify a threshold for ribosomal gene expression, we can make a histogram :

```{r qc_ribo_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.rb",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.rb)
```


```{r vln_percentrb_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "percent.rb", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.rb, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_ribo_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_percent.rb,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.rb",
                subtitle = paste0(length(fail_percent.rb), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### FACS-like figure

We would like to see if the number of feature expressed by cell, and the number of UMI is correlated with the cell type, the percentage of mitochondrial and ribosomal gene expressed, and the doublet status. We build the `log_nCount_RNA` by `nFeature_RNA` figure, where cells (dots) are colored by these different metrics.

This is the figure, colored by cell type :

```{r qc_patchwork_cell_type, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "cell_type",
                           col_colors = unname(color_markers),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the percentage of mitochondrial genes expressed in cell :

```{r qc_patchwork_mito, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.mt",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the percentage of ribosomal genes expressed in cell :

```{r qc_patchwork_ribo, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.rb",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`doublets_consensus.class`) :

```{r qc_patchwork_doublet_consensus, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "doublets_consensus.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`scDblFinder.class`) :

```{r qc_patchwork_scdblfinder, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "scDblFinder.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`hybrid_score.class`) :

```{r qc_patchwork_hybrid_score, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "hybrid_score.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```


### Visualization as piechart

Do filtered cells belong to a particular cell type ?

```{r qc_piechart_cell_type, class.source = "fold-hide", fig.width = 12, fig.height = 9}
sobj$all_cells = TRUE

plot_list = list()

## All cells
df = sobj@meta.data
if (nrow(df) == 0) {
  plot_list[[1]] = ggplot()
} else {
  plot_list[[1]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "All cells",
                  subtitle = paste(nrow(df), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## Doublets consensus
df = sobj@meta.data %>%
  dplyr::filter(doublets_consensus.class)
if (nrow(df) == 0) {
  plot_list[[2]] = ggplot()
} else {
  plot_list[[2]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "doublets_consensus.class",
                  subtitle = paste(sum(sobj$doublets_consensus.class, na.rm = TRUE), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.mt
df = sobj@meta.data %>%
  dplyr::filter(percent.mt > cut_percent.mt)
if (nrow(df) == 0) {
  plot_list[[3]] = ggplot()
} else {
  plot_list[[3]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.mt >", cut_percent.mt),
                  subtitle = paste(length(fail_percent.mt), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.rb
df = sobj@meta.data %>%
  dplyr::filter(percent.rb > cut_percent.rb)
if (nrow(df) == 0) {
  plot_list[[4]] = ggplot()
} else {
  plot_list[[4]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.rb >", cut_percent.rb),
                  subtitle = paste(length(fail_percent.rb), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## log_nCount_RNA
df = sobj@meta.data %>%
  dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA)
if (nrow(df) == 0) {
  plot_list[[5]] = ggplot()
} else {
  plot_list[[5]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("log_nCount_RNA <", round(cut_log_nCount_RNA, 2)),
                  subtitle = paste(length(fail_log_nCount_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## nFeature_RNA
df = sobj@meta.data %>%
  dplyr::filter(nFeature_RNA < cut_nFeature_RNA)
if (nrow(df) == 0) {
  plot_list[[6]] = ggplot()
} else {
  plot_list[[6]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("nFeature_RNA <", round(cut_nFeature_RNA, 2)),
                  subtitle = paste(length(fail_nFeature_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

patchwork::wrap_plots(plot_list, ncol = 3) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")
```

```{r clean_qc_4, echo = FALSE}
sobj$all_cells = NULL

rm(plot_list, df)
```


### Doublet cells status

We can compare doublet detection methods with a Venn diagram : 

```{r qc_venn_doublet, class.source = "fold-hide", fig.width = 8, fig.height = 6}
ggvenn::ggvenn(list(hybrid = fail_doublets_hybrid,
                    scDblFinder = fail_doublets_scDblFinder), 
               fill_color = c("#0073C2FF", "#EFC000FF"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::ggtitle(label = "Doublet cells") +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"))
```

We visualize cells annotation for doublets :

```{r doublets_proj, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
plot_list = list()

# scDblFinder.class
sobj$fail = ifelse(sobj$scDblFinder.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "scDblFinder.class",
                subtitle = paste0(sum(sobj$scDblFinder.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() + Seurat::NoLegend() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

# hybrid_score.class
sobj$fail = ifelse(sobj$hybrid_score.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "hybrid_score.class",
                subtitle = paste0(sum(sobj$hybrid_score.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

sobj$fail = NULL

# Plot
patchwork::wrap_plots(plot_list, nrow = 1)
```

What is the composition of doublet cells ? We just look at score for each cell type.

```{r doublets_composition, class.source = "fold-hide"}
sobj$orig.ident.doublets = case_when(is.na(sobj$doublets_consensus.class) ~ "bad quality",
                                     sobj$doublets_consensus.class == TRUE ~ paste0(sobj$orig.ident, " doublets"),
                                     sobj$doublets_consensus.class == FALSE ~ "not doublet")
sobj$orig.ident.doublets = factor(sobj$orig.ident.doublets,
                                  levels = c(paste0(as.character(sample_info$sample_identifiant), " doublets"),
                                             "bad quality", "not doublet"))

doublets_compo = function(score1, score2) {
  type1 = unlist(lapply(stringr::str_split(score1, pattern = "score_"), `[[`, 2))
  type2 = unlist(lapply(stringr::str_split(score2, pattern = "score_"), `[[`, 2))
  
  if (type1 == type2) {
    the_title = "Homotypic doublet"
    the_subtitle = type1
    score1 = "log_nCount_RNA"
  } else {
    the_title = "Heterotypic doublet"
    the_subtitle = paste(type1, type2, sep = " + ")
  }
  
  p = sobj@meta.data %>%
    dplyr::arrange(desc(orig.ident.doublets)) %>%
    ggplot2::ggplot(., aes(x = eval(parse(text = score1)),
                           y = eval(parse(text = score2)),
                           col = orig.ident.doublets)) +
    ggplot2::geom_point(size = 0.25) +
    ggplot2::scale_color_manual(values = c(sample_info$color, "gray90", "gray60"),
                                breaks = c(paste0(as.character(sample_info$sample_identifiant), " doublets"),
                                           "bad quality", "not doublet")) +
    ggplot2::labs(x = score1, y = score2,
                  title = the_title, subtitle = the_subtitle) +
    ggplot2::theme_classic() +
    ggplot2::theme(aspect.ratio = 1,
                   plot.title = element_text(hjust = 0.5),
                   plot.subtitle = element_text(hjust = 0.5))
  
  return(p)
}
```

```{r doublets_patchwork_prep, class.source = "fold-hide"}
score_columns = grep(x = colnames(sobj@meta.data),
                     pattern = "^score",
                     value = TRUE)
combinations = expand.grid(score_columns, score_columns) %>%
  apply(., 1, sort) %>% t() %>%
  as.data.frame()
combinations = combinations[!duplicated(combinations), ]

plot_list = apply(combinations, 1, FUN = function(elem) {
  doublets_compo(elem[1], elem[2])
})

sobj$orig.ident.doublets = NULL
```


```{r doublets_patchwork, fig.width = 12, fig.height = 80, fold_plot = TRUE}
patchwork::wrap_plots(plot_list, ncol = 4) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")
```

## Save

We could save this object before filtering (remove `eval = FALSE`) :

```{r save_sobj_unfiltered_annotated, eval = FALSE}
saveRDS(sobj, paste0(out_dir, "/datasets/", sample_name, "_sobj_unfiltered.rds"))
```


# Filtering

We remove :

* cells with a number of UMI lower than `r cut_log_nCount_RNA`
* cells expressing a number of genes lower than `r cut_nFeature_RNA`
* cells having more than `r cut_percent.mt` \% of UMI related to mitochondrial genes
* cells having more than `r cut_percent.rb` \% of UMI related to ribosomal genes

**Note**: We do not filter cells detected as doublets. Indeed, few genes and transcripts are detected per cell, and the best cells are therefore annotated as doublets.

```{r filter_cells}
sobj = subset(sobj, invert = TRUE,
              cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA,
                               fail_percent.mt, fail_percent.rb)))
sobj
```

```{r clean_filter, echo = FALSE}
rm(fail_doublets_consensus, fail_doublets_scDblFinder, fail_doublets_hybrid,
   fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA,
   cut_percent.mt, cut_percent.rb, cut_log_nCount_RNA, cut_nFeature_RNA)
```


# Post-filtering processing

## Normalization

We normalize the count matrix for remaining cells :

```{r normalization_3}
sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
```

## Projection

We perform a PCA :

```{r pca, fig.width = 12, fig.height = 4}
sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")
```


We generate a tSNE and a UMAP with 20 principal components :

```{r tsne_umap}
ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"),
                       check_duplicates = FALSE)

sobj = Seurat::RunUMAP(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_umap"))
```


## Annotation

We annotate cells for cell type, with the new normalized expression matrix :

```{r cell_type_2, time_it = TRUE}
score_columns = grep(x = colnames(sobj@meta.data), pattern = "^score", value = TRUE)
sobj@meta.data[, score_columns] = NULL
sobj$cell_type = NULL

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

table(sobj$cell_type)
```


To justify cell type annotation, we can make a dotplot :

```{r dotplot_cell_type_short2, fig.width = 12, fig.height = 9}
markers = c("PTPRC", unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))
```

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

```{r barplot_celltype2, fig.height = 6, fig.width = 8, class.source = "fold-hide"}
df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))
```


### Cell cycle

We annotate cells for cell cycle phase :

```{r cell_cycle2, time_it = TRUE}
cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]

sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
```


We visualize cell cycle on the projection :

```{r see_cell_cycle2, fig.width = 12, fig.height = 7, class.source = "fold-hide"}
plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)
```


## Clustering

We make a highly resolutive clustering :

```{r clustering}
sobj = Seurat::FindNeighbors(sobj, reduction = "RNA_pca", dims = c(1:ndims))
sobj = Seurat::FindClusters(sobj, resolution = 2)

table(sobj$seurat_clusters)
```


# Visualization

## Cell type

We can visualize the cell type :

```{r see_cell_type, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_tsne"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_umap"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```

## Cell cycle

We can visualize the cell cycle, from Seurat :

```{r see_cc_Seurat, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


We can visualize the cell cycle, from cyclone :

```{r see_cc_cyclone, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


## Clusters

We visualize the clustering :

```{r see_clusters, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


## Gene expression

We visualize all cell types markers on the tSNE :

```{r plot_genes, fig.width = 12, fig.height = 20}
markers = dotplot_markers %>% unlist() %>% unname()
markers = markers[markers %in% rownames(sobj)]

plot_list = lapply(markers,
                   FUN = function(one_gene) {
                     p = Seurat::FeaturePlot(sobj, features = one_gene,
                                             reduction = paste0("RNA_pca_", ndims, "_tsne")) +
                       ggplot2::labs(title = one_gene) +
                       ggplot2::scale_color_gradientn(colors = aquarius::color_gene) +
                       ggplot2::theme(aspect.ratio = 1,
                                      plot.subtitle = element_text(hjust = 0.5)) +
                       Seurat::NoAxes()
                     return(p)
                   })

patchwork::wrap_plots(plot_list, ncol = 4)
```


# Save

We save the annotated and filtered Seurat object :

```{r save_sobj_filtered_annotated}
saveRDS(sobj, file = paste0(out_dir, "/datasets/", sample_name, "_sobj_filtered.rds"))
```


# R session

```{r sessioninfo, echo = FALSE, fold_output = TRUE}
sessionInfo()
```
